Instruction:
if you are interested in the final model, run everything before Data Visualization, and then run “resample” and “create train and test set”, and finally “use the whole train data to train.”
The NYC map visualization can make RStudio respond slowly — hang in there. You can simply comment out or skip that chunk to save some time.
Some chunks (mostly the cross-validation processes) may take 15–25 minutes to finish running. If one takes longer than that, your RStudio session has most likely crashed. Restart the whole session and follow the same procedure.
# install these packages first
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-4
library(tree)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(rpart)
library(cluster)
library(cld2) # this would be helpful for detecting languages
library(caret) # this would be helpful for tuning parameters (not sure if we need it or not)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC) # this can be helpful for ROC curve
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# change the path to import the original data
# NOTE(review): hard-coded absolute paths only work on the original author's
# machine; consider relative paths (e.g. file.path() from the project root).
airbnb <- read.csv("/Users/t.l./Desktop/DUKE MQM/Fall 2022/Data Science for Biz/Final Project/Airbnb_Open_Data.csv")
# Project helper functions; supplies support() used in the lasso section below.
source("/Users/t.l./Desktop/DUKE MQM/Fall 2022/Data Science for Biz/Final Project/DataAnalyticsFunctions.R")
# Raw-data overview: column types, ranges, and NA counts.
summary(airbnb)
## id NAME host.id
## Min. : 1001254 Length:102599 Min. :1.236e+08
## 1st Qu.:15085814 Class :character 1st Qu.:2.458e+10
## Median :29136603 Mode :character Median :4.912e+10
## Mean :29146235 Mean :4.925e+10
## 3rd Qu.:43201198 3rd Qu.:7.400e+10
## Max. :57367417 Max. :9.876e+10
##
## host_identity_verified host.name neighbourhood.group
## Length:102599 Length:102599 Length:102599
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## neighbourhood lat long country
## Length:102599 Min. :40.50 Min. :-74.25 Length:102599
## Class :character 1st Qu.:40.69 1st Qu.:-73.98 Class :character
## Mode :character Median :40.72 Median :-73.95 Mode :character
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.93
## Max. :40.92 Max. :-73.71
## NA's :8 NA's :8
## country.code instant_bookable cancellation_policy room.type
## Length:102599 Mode :logical Length:102599 Length:102599
## Class :character FALSE:51474 Class :character Class :character
## Mode :character TRUE :51020 Mode :character Mode :character
## NA's :105
##
##
##
## Construction.year price service.fee minimum.nights
## Min. :2003 Length:102599 Length:102599 Min. :-1223.000
## 1st Qu.:2007 Class :character Class :character 1st Qu.: 2.000
## Median :2012 Mode :character Mode :character Median : 3.000
## Mean :2012 Mean : 8.136
## 3rd Qu.:2017 3rd Qu.: 5.000
## Max. :2022 Max. : 5645.000
## NA's :214 NA's :409
## number.of.reviews last.review reviews.per.month review.rate.number
## Min. : 0.00 Length:102599 Min. : 0.010 Min. :1.000
## 1st Qu.: 1.00 Class :character 1st Qu.: 0.220 1st Qu.:2.000
## Median : 7.00 Mode :character Median : 0.740 Median :3.000
## Mean : 27.48 Mean : 1.374 Mean :3.279
## 3rd Qu.: 30.00 3rd Qu.: 2.000 3rd Qu.:4.000
## Max. :1024.00 Max. :90.000 Max. :5.000
## NA's :183 NA's :15879 NA's :326
## calculated.host.listings.count availability.365 house_rules
## Min. : 1.000 Min. : -10.0 Length:102599
## 1st Qu.: 1.000 1st Qu.: 3.0 Class :character
## Median : 1.000 Median : 96.0 Mode :character
## Mean : 7.937 Mean : 141.1
## 3rd Qu.: 2.000 3rd Qu.: 269.0
## Max. :332.000 Max. :3677.0
## NA's :319 NA's :448
## license
## Length:102599
## Class :character
## Mode :character
##
##
##
##
# drop columns that we don't need (identifiers and constant columns)
drop <- c("id", "host.id", "host.name", "country", "country.code", "license")
airbnb <- airbnb[,!(names(airbnb) %in% drop)]
ncol(airbnb) # from 26 columns to 20 columns (6 dropped; output below confirms)
## [1] 20
# drop rows without review.rate.number (the variable the target is derived from)
airbnb <- airbnb[-which(is.na(airbnb$review.rate.number)),]
nrow(airbnb) # from 102599 rows to 102273 rows
## [1] 102273
# Standardize missing values: empty strings in these columns become NA.
# neighbourhood.group is intentionally NOT converted here — its "" entries
# are back-filled from the neighbourhood lookup table built further below.
airbnb$NAME[airbnb$NAME == ""] <- NA
# airbnb <- airbnb[-which(is.na(airbnb$NAME)),]
cols_blank_to_na <- c(
  "host_identity_verified", "instant_bookable", "cancellation_policy",
  "room.type", "Construction.year", "price", "service.fee",
  "minimum.nights", "last.review", "availability.365", "house_rules",
  "calculated.host.listings.count", "review.rate.number",
  "reviews.per.month", "neighbourhood"
)
for (cn in cols_blank_to_na) {
  airbnb[[cn]][airbnb[[cn]] == ""] <- NA
}
#
# nrow(airbnb) # from 102273 rows to 102032 rows
# this airbnb dataset can be used to tell which language has a better avg rating in EDA
unique(detect_language(airbnb$NAME, plain_text = TRUE))
## [1] "en" NA "ms" "fr" "pt" "no" "is"
## [8] "rw" "sk" "ro" "es" "nl" "cy" "tl"
## [15] "zh" "mg" "ja" "id" "zh-Hant" "ru" "ko"
## [22] "ny" "da" "de" "iw" "ka" "it" "hu"
## [29] "gd" "ga" "et" "af" "eu" "hr" "cs"
## [36] "ca" "lg" "ceb" "gl" "xx-Qaai" "ht"
# PERF: detect the listing-name language once and reuse it (the original
# called detect_language() five times over ~100k names).
name_lang <- detect_language(airbnb$NAME)
# Dummy flags for the listing-name language. An NA detection (or missing
# name) counts as 0, matching the original's NA-to-0 backfill.
airbnb$name_unknown <- ifelse(is.na(airbnb$NAME), 1, 0)
airbnb$name_en <- ifelse(!is.na(name_lang) & name_lang == "en", 1, 0)
airbnb$name_cn <- ifelse(!is.na(name_lang) & name_lang %in% c("zh", "zh-Hant"), 1, 0)
airbnb$name_kr <- ifelse(!is.na(name_lang) & name_lang == "ko", 1, 0)
airbnb$name_fr <- ifelse(!is.na(name_lang) & name_lang == "fr", 1, 0)
# if these columns are all 0 in a row, then it implies name is in other languages
# Construction.year: normalize to a numeric 4-digit year.
# NOTE(review): the ISOdate()/as.Date()/format() round trip is a no-op for
# values that are already years; as.numeric() alone would suffice — confirm.
airbnb$Construction.year <- as.numeric(format(as.Date(ISOdate(airbnb$Construction.year,1,1)), "%Y"))
airbnb$last.review <- as.numeric(format(as.Date(airbnb$last.review,format="%m/%d/%Y"), "%Y")) # now as year
# Strip "$" and thousands separators, then parse prices/fees as numbers.
airbnb$price <- as.numeric(gsub(",", "", gsub("\\$", "", airbnb$price)))
airbnb$service.fee <- as.numeric(gsub(",", "", gsub("\\$", "", airbnb$service.fee)))
unique(airbnb$neighbourhood.group)
## [1] "Brooklyn" "Manhattan" "brookln" "manhatan"
## [5] "" "Queens" "Staten Island" "Bronx"
# Fix the two misspelled borough names.
airbnb$neighbourhood.group <- sub("brookln", "Brooklyn", airbnb$neighbourhood.group)
airbnb$neighbourhood.group <- sub("manhatan", "Manhattan", airbnb$neighbourhood.group)
unique(airbnb$neighbourhood.group)
## [1] "Brooklyn" "Manhattan" "" "Queens"
## [5] "Staten Island" "Bronx"
# match neighbourhood with neighbourhood.dict
# Build a lookup: each neighbourhood -> the first non-missing, non-blank
# neighbourhood.group (borough) observed for it.
neighbourhood_dict <- data.frame(neighbourhood = unique(airbnb$neighbourhood)[!is.na(unique(airbnb$neighbourhood))])
for (i in seq_len(nrow(neighbourhood_dict))) {
  index <- which(airbnb$neighbourhood == neighbourhood_dict$neighbourhood[i])
  groups <- airbnb$neighbourhood.group[index]
  # BUG FIX: the original filtered with the full-length mask
  # !is.na(airbnb$neighbourhood.group) applied to the already-subset vector,
  # silently misaligning the filter. The mask must be computed on the subset.
  # (neighbourhood.group blanks are "" rather than NA here, so the result is
  # the same in practice, but the latent misalignment is now gone.)
  groups <- groups[!is.na(groups) & groups != ""]
  neighbourhood_dict$neighbourhood.group[i] <- groups[1]
}
# substitute "" with correct neighbourhood_dict
for (i in seq_along(airbnb$neighbourhood.group)) {
  if (airbnb$neighbourhood.group[i] == "") {
    nb <- airbnb$neighbourhood[i]
    airbnb$neighbourhood.group[i] <- neighbourhood_dict$neighbourhood.group[neighbourhood_dict$neighbourhood == nb]
  }
}
unique(airbnb$neighbourhood.group)
## [1] "Brooklyn" "Manhattan" "Queens" "Staten Island"
## [5] "Bronx"
# NOTE(review): this sums the year VALUES greater than 2022, not their count;
# sum(airbnb$last.review > 2022, na.rm = TRUE) was probably intended. The
# printed 10173 is consistent with ~5 future-dated years being summed, so
# only a handful of rows are affected — confirm.
sum(airbnb$last.review[airbnb$last.review > 2022],na.rm=TRUE)
## [1] 10173
# Cap last.review at 2022 (future years are data errors); NAs stay NA.
airbnb$last.review <- ifelse(airbnb$last.review <= 2022, airbnb$last.review, 2022)
# Post-cleaning overview before outlier clamping and imputation.
summary(airbnb)
## NAME host_identity_verified neighbourhood.group
## Length:102273 Length:102273 Length:102273
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## neighbourhood lat long instant_bookable
## Length:102273 Min. :40.50 Min. :-74.25 Mode :logical
## Class :character 1st Qu.:40.69 1st Qu.:-73.98 FALSE:51330
## Mode :character Median :40.72 Median :-73.95 TRUE :50852
## Mean :40.73 Mean :-73.95 NA's :91
## 3rd Qu.:40.76 3rd Qu.:-73.93
## Max. :40.92 Max. :-73.71
## NA's :8 NA's :8
## cancellation_policy room.type Construction.year price
## Length:102273 Length:102273 Min. :2003 Min. : 50.0
## Class :character Class :character 1st Qu.:2007 1st Qu.: 340.0
## Mode :character Mode :character Median :2012 Median : 625.0
## Mean :2012 Mean : 625.4
## 3rd Qu.:2017 3rd Qu.: 913.0
## Max. :2022 Max. :1200.0
## NA's :206 NA's :247
## service.fee minimum.nights number.of.reviews last.review
## Min. : 10 Min. :-1223.000 Min. : 0.00 Min. :2012
## 1st Qu.: 68 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:2018
## Median :125 Median : 3.000 Median : 7.00 Median :2019
## Mean :125 Mean : 8.133 Mean : 27.42 Mean :2019
## 3rd Qu.:183 3rd Qu.: 5.000 3rd Qu.: 30.00 3rd Qu.:2019
## Max. :240 Max. : 5645.000 Max. :1024.00 Max. :2022
## NA's :273 NA's :389 NA's :182 NA's :15855
## reviews.per.month review.rate.number calculated.host.listings.count
## Min. : 0.010 Min. :1.000 Min. : 1.000
## 1st Qu.: 0.220 1st Qu.:2.000 1st Qu.: 1.000
## Median : 0.740 Median :3.000 Median : 1.000
## Mean : 1.373 Mean :3.279 Mean : 7.939
## 3rd Qu.: 2.000 3rd Qu.:4.000 3rd Qu.: 2.000
## Max. :90.000 Max. :5.000 Max. :332.000
## NA's :15841 NA's :251
## availability.365 house_rules name_unknown name_en
## Min. : -10 Length:102273 Min. :0.000000 Min. :0.0000
## 1st Qu.: 3 Class :character 1st Qu.:0.000000 1st Qu.:1.0000
## Median : 96 Mode :character Median :0.000000 Median :1.0000
## Mean : 141 Mean :0.002356 Mean :0.9264
## 3rd Qu.: 268 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :3677 Max. :1.000000 Max. :1.0000
## NA's :434
## name_cn name_kr name_fr
## Min. :0.000000 Min. :0.0e+00 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.0e+00 1st Qu.:0.000000
## Median :0.000000 Median :0.0e+00 Median :0.000000
## Mean :0.002298 Mean :8.8e-05 Mean :0.001095
## 3rd Qu.:0.000000 3rd Qu.:0.0e+00 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.0e+00 Max. :1.000000
##
# airbnb will be the dataset we use to explore different languages' impact on rating
# can also create graph based on lat and long to show where the apartments are
#unique(airbnb$host_identity_verified)
#unique(airbnb$neighbourhood.group)
#unique(airbnb$neighbourhood)
#unique(airbnb$cancellation_policy)
#unique(airbnb$room.type)
#unique(airbnb$house_rules)
# Clamp minimum.nights into the plausible range [1, 31] and
# availability.365 into [0, 365]. pmin/pmax propagate NA, so missing
# values stay missing exactly as with the original ifelse() chain.
airbnb$minimum.nights <- pmax(pmin(airbnb$minimum.nights, 31), 1)
airbnb$availability.365 <- pmax(pmin(airbnb$availability.365, 365), 0)
# need to decide the question to ask so that we can drop useless columns
# need to decide the question to ask so that we can drop useless columns
# Flag rows where more than half of the columns are NA (threshold = 0.5).
# Vectorized with rowSums() instead of the original grow-with-cbind loop,
# which copied the accumulator on every append.
row <- which(rowSums(is.na(airbnb)) > ncol(airbnb) * 0.5)
if (length(row) == 0) row <- NULL # preserve the original NULL result when no row qualifies
row
## NULL
# Per-column MEDIANS (despite the name "data_mean") of the numeric columns,
# used below as imputation values. Columns c(10:19, 21:25) are the numeric
# ones; entries appear in that column order.
data_mean <- sapply(airbnb[,c(10:19,21:25)],median, na.rm=TRUE) # these numbers are the columns with numerical values
data_mean
## Construction.year price
## 2012.00 625.00
## service.fee minimum.nights
## 125.00 3.00
## number.of.reviews last.review
## 7.00 2019.00
## reviews.per.month review.rate.number
## 0.74 3.00
## calculated.host.listings.count availability.365
## 1.00 96.00
## name_unknown name_en
## 0.00 1.00
## name_cn name_kr
## 0.00 0.00
## name_fr
## 0.00
# need to fit na as shown in the following code
# Replace the NA entries of `vec` with the fallback value `mn`;
# non-missing entries are returned unchanged.
impute_data <- function(vec, mn) {
  filled <- vec
  filled[is.na(filled)] <- mn
  filled
}
# Impute NAs in the numeric columns with the medians computed above.
# data_mean holds medians for columns c(10:19, 21:25) in that order:
# entries 1-10 map to columns 10-19, entries 11-15 to columns 21-25.
for (i in 10:19) {
  airbnb[, i] <- impute_data(airbnb[, i], data_mean[i - 9])
}
# BUG FIX: the original second loop also used data_mean[i - 9], which is off
# by one for columns 21:25 (column 21 maps to entry 11, i.e. i - 10) and
# indexed past the end of data_mean (giving NA) for column 25. In practice
# the name-language dummies had no NAs left (backfilled earlier), so results
# are unchanged, but the indexing is now correct.
for (i in 21:25) {
  airbnb[, i] <- impute_data(airbnb[, i], data_mean[i - 10])
}
# Post-imputation overview: numeric columns should show no NA counts.
summary(airbnb)
## NAME host_identity_verified neighbourhood.group
## Length:102273 Length:102273 Length:102273
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## neighbourhood lat long instant_bookable
## Length:102273 Min. :40.50 Min. :-74.25 Mode :logical
## Class :character 1st Qu.:40.69 1st Qu.:-73.98 FALSE:51330
## Mode :character Median :40.72 Median :-73.95 TRUE :50852
## Mean :40.73 Mean :-73.95 NA's :91
## 3rd Qu.:40.76 3rd Qu.:-73.93
## Max. :40.92 Max. :-73.71
## NA's :8 NA's :8
## cancellation_policy room.type Construction.year price
## Length:102273 Length:102273 Min. :2003 Min. : 50.0
## Class :character Class :character 1st Qu.:2008 1st Qu.: 341.0
## Mode :character Mode :character Median :2012 Median : 625.0
## Mean :2012 Mean : 625.4
## 3rd Qu.:2017 3rd Qu.: 912.0
## Max. :2022 Max. :1200.0
##
## service.fee minimum.nights number.of.reviews last.review
## Min. : 10 Min. : 1.000 Min. : 0.00 Min. :2012
## 1st Qu.: 68 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:2019
## Median :125 Median : 3.000 Median : 7.00 Median :2019
## Mean :125 Mean : 6.934 Mean : 27.38 Mean :2019
## 3rd Qu.:182 3rd Qu.: 5.000 3rd Qu.: 30.00 3rd Qu.:2019
## Max. :240 Max. :31.000 Max. :1024.00 Max. :2022
##
## reviews.per.month review.rate.number calculated.host.listings.count
## Min. : 0.010 Min. :1.000 Min. : 1.000
## 1st Qu.: 0.280 1st Qu.:2.000 1st Qu.: 1.000
## Median : 0.740 Median :3.000 Median : 1.000
## Mean : 1.275 Mean :3.279 Mean : 7.922
## 3rd Qu.: 1.710 3rd Qu.:4.000 3rd Qu.: 2.000
## Max. :90.000 Max. :5.000 Max. :332.000
##
## availability.365 house_rules name_unknown name_en
## Min. : 0 Length:102273 Min. :0.000000 Min. :0.0000
## 1st Qu.: 3 Class :character 1st Qu.:0.000000 1st Qu.:1.0000
## Median : 96 Mode :character Median :0.000000 Median :1.0000
## Mean :140 Mean :0.002356 Mean :0.9264
## 3rd Qu.:268 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :365 Max. :1.000000 Max. :1.0000
##
## name_cn name_kr name_fr
## Min. :0.000000 Min. :0.0e+00 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.0e+00 1st Qu.:0.000000
## Median :0.000000 Median :0.0e+00 Median :0.000000
## Mean :0.002298 Mean :8.8e-05 Mean :0.001095
## 3rd Qu.:0.000000 3rd Qu.:0.0e+00 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.0e+00 Max. :1.000000
##
# Age of the building as of its most recent review (can be negative when a
# capped/imputed last.review precedes Construction.year — see summary below).
airbnb$years.since.construction <- airbnb$last.review - airbnb$Construction.year
# airbnb$years.since.construction
# because there are duplicated rows containing same listings in different year
# not necessarily to be mentioned in the report
# Impute NAs in the categorical columns with each column's most frequent
# level: columns 2:4 = host_identity_verified, neighbourhood.group,
# neighbourhood; 7:9 = instant_bookable, cancellation_policy, room.type.
# NOTE(review): assigning the character "FALSE"/"TRUE" coerces the logical
# instant_bookable column to character (visible in the unique() output below).
for (i in c(2:4,7:9)){
airbnb[,i][is.na(airbnb[,i])] <- names(which.max(table(airbnb[i])))
}
# in case we want it to be the most frequent word in that column
# names(which.max(table(airbnb[i])))
# Sanity-check the cleaned categorical levels.
unique(airbnb$host_identity_verified)
## [1] "unconfirmed" "verified"
unique(airbnb$neighbourhood.group)
## [1] "Brooklyn" "Manhattan" "Queens" "Staten Island"
## [5] "Bronx"
#unique(airbnb$neighbourhood)
unique(airbnb$instant_bookable)
## [1] "FALSE" "TRUE"
unique(airbnb$cancellation_policy)
## [1] "strict" "moderate" "flexible"
unique(airbnb$room.type)
## [1] "Private room" "Entire home/apt" "Shared room" "Hotel room"
#unique(airbnb$house_rules)
# Binary target: 1 when the listing has a perfect 5 rating, else 0.
airbnb$high.rating <- as.factor(ifelse(airbnb$review.rate.number == 5, 1, 0))
unique(airbnb$high.rating)
## [1] 0 1
## Levels: 0 1
# write.csv(airbnb, "/Users/t.l./Desktop/DUKE MQM/Fall 2022/Data Science for Biz/Final Project/cleaned_airbnb.csv")
# EDA charts --------------------------------------------------------------
# Listing counts by construction year.
ggplot(airbnb, aes(x = Construction.year)) +
  geom_bar() +
  labs(x = 'Construction Year', y = 'Total Constructed',
       title = 'Total Amount of Airbnbs by Construction Year')
# Price distribution for each room type.
ggplot(airbnb, aes(x = room.type, y = price)) +
  geom_boxplot() +
  labs(x = 'Type of Room', y = 'Price', title = 'Airbnb Price by Room Type')
# Price histograms, faceted by rating.
ggplot(airbnb, aes(x = price, fill = review.rate.number)) +
  geom_histogram(binwidth = 25) +
  facet_wrap(~review.rate.number) +
  labs(x = 'Price', y = 'Count of Airbnbs',
       title = 'Total Amount of Airbnbs by Price and Review', fill = "Rating")
# Room-type mix within each rating (proportions).
ggplot(airbnb, aes(x = review.rate.number, fill = room.type)) +
  geom_bar(position = "fill") +
  labs(x = 'Rating', y = 'Proportion',
       title = 'Proportion of Airbnb Rental Type by Rating',
       fill = 'Type of Rental')
# Room-type mix within each rating (counts).
ggplot(airbnb, aes(x = review.rate.number, fill = room.type)) +
  geom_bar() +
  labs(x = 'Rating', y = 'Count of Ratings',
       title = 'Total amount of Airbnb Rental Type by Rating',
       fill = 'Type of Rental')
# Rating counts split by borough.
ggplot(airbnb, aes(x = review.rate.number, fill = neighbourhood.group)) +
  geom_bar() +
  labs(x = 'Rating', y = 'Count of Ratings',
       title = 'Total amount of Airbnb Rental Type by Neighbourhood Group',
       fill = 'Neighbourhood Group')
# probably comment this if you don't want to run this over and over again
# it can take some storage space
library(leaflet)
# Plot every listing's coordinates over an NYC base map (heavy to render).
df <- data.frame(lat = airbnb$lat, long = airbnb$long)
leaflet(df) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircles(data = df, weight =0)
## Warning in validateCoords(lng, lat, funcName): Data contains 8 rows with either
## missing or invalid lat/lon values and will be ignored
library(leaflet)
# Same map, colored by the high.rating target (factor levels 0/1 mapped to
# blue/red by colorFactor).
df <- data.frame(lat = airbnb$lat, long = airbnb$long, col = airbnb$high.rating)
pal <- colorFactor(
palette = c('blue', 'red'),
domain = df$col
)
leaflet(df) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircles(data = df, weight =0, color = ~pal(col))
## Assuming "long" and "lat" are longitude and latitude, respectively
## Warning in validateCoords(lng, lat, funcName): Data contains 8 rows with either
## missing or invalid lat/lon values and will be ignored
# Columns excluded from clustering: free text and review-derived outcomes.
# NOTE(review): "num.of.reviews" does not exist in airbnb (the real column,
# "number.of.reviews", is also listed), so that entry is a harmless no-op.
drop_cl <- c("NAME", "num.of.reviews", "last.review", "reviews.per.month",
"review.rate.number","calculated.host.listings.count", "house_rules",
"number.of.reviews")
airbnb_cl <- airbnb[,!(names(airbnb) %in% drop_cl)]
set.seed(1)
# Stratified ~1% subsample: sample within each neighbourhood x host x
# bookable x policy x room-type cell so the pairwise gower dissimilarity
# computed below stays tractable.
airbnb_cl <- airbnb_cl %>%
nest_by(neighbourhood, host_identity_verified, instant_bookable, cancellation_policy,
room.type, .key = "xy") %>%
mutate(sample = list(xy[sample(1:nrow(xy),
size = round(0.01*nrow(xy))),])) %>%
select(-xy) %>%
summarize(sample)
## `summarise()` has grouped output by 'neighbourhood', 'host_identity_verified',
## 'instant_bookable', 'cancellation_policy', 'room.type'. You can override using
## the `.groups` argument.
# Convert the categorical clustering variables to factors so that
# daisy(metric = "gower") treats them as nominal rather than character.
categorical_vars <- c(
  "host_identity_verified", "neighbourhood", "instant_bookable",
  "cancellation_policy", "room.type", "neighbourhood.group",
  "name_cn", "name_en", "name_kr", "name_fr", "name_unknown"
)
for (v in categorical_vars) {
  airbnb_cl[[v]] <- as.factor(airbnb_cl[[v]])
}
# Gower dissimilarity handles the mixed numeric/factor columns.
dissimilarity <- daisy(airbnb_cl, metric = c("gower"))
# Agglomerative hierarchical clustering with complete linkage; the
# dendrogram is cut into 5 clusters.
hc<-hclust(dissimilarity, method = "complete")
plot(hc, labels=FALSE)
rect.hclust(hc, k=5, border="red")
cluster<-cutree(hc, k=5)
airbnb_cl$cluster <- as.factor(cluster)
# Cluster profiles: composition of each cluster by key categorical variables,
# with count labels overlaid on the proportion bars.
# MODERNIZATION: stat(count) is deprecated in ggplot2; after_stat(count) is
# the supported equivalent and has been available since ggplot2 3.0 (this
# session loads 3.3.6). Output is identical.
ggplot(data = airbnb_cl, aes(x = cluster, fill = neighbourhood.group)) +
  geom_bar(position = "fill") + ylab("proportion") +
  stat_count(geom = "text",
             aes(label = after_stat(count)),
             position = position_fill(vjust = 0.6), colour = "white")
ggplot(data = airbnb_cl, aes(x = cluster, fill = host_identity_verified)) +
  geom_bar(position = "fill") + ylab("proportion") +
  stat_count(geom = "text",
             aes(label = after_stat(count)),
             position = position_fill(vjust = 0.6), colour = "white")
ggplot(data = airbnb_cl, aes(x = cluster, fill = instant_bookable)) +
  geom_bar(position = "fill") + ylab("proportion") +
  stat_count(geom = "text",
             aes(label = after_stat(count)),
             position = position_fill(vjust = 0.6), colour = "white")

ggplot(data = airbnb_cl, aes(x = cluster, fill = cancellation_policy)) +
  geom_bar(position = "fill") + ylab("proportion") +
  stat_count(geom = "text",
             aes(label = after_stat(count)),
             position = position_fill(vjust = 0.6), colour = "white")
ggplot(data = airbnb_cl, aes(x = cluster, fill = room.type)) +
  geom_bar(position = "fill") + ylab("proportion") +
  stat_count(geom = "text",
             aes(label = after_stat(count)),
             position = position_fill(vjust = 0.6), colour = "white")
# Price and construction-year distributions within each cluster.
ggplot(data = airbnb_cl, aes(x = cluster, y = price)) +
  geom_boxplot(alpha = 0) +
  geom_jitter(alpha = 0.5, width = 0.2, height = 0.2, color = "tomato")
ggplot(data = airbnb_cl, aes(x = cluster, y = Construction.year)) +
  geom_boxplot(alpha = 0) +
  geom_jitter(alpha = 0.5, width = 0.2, height = 0.2, color = "tomato")
# Map the clustering subsample, colored by cluster (5 colors, 5 clusters).
df <- data.frame(lat = airbnb_cl$lat, long = airbnb_cl$long, col = airbnb_cl$cluster)
pal <- colorFactor(
palette = c("red","green", "yellow","blue", "pink"),
domain = df$col
)
leaflet(df) %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircles(data = df, weight =3, color = ~pal(col))
## Assuming "long" and "lat" are longitude and latitude, respectively
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
#Create a vector containing only the text
text <- airbnb$house_rules
# Create a corpus
docs <- Corpus(VectorSource(text))
# Standard text cleaning: strip numbers and punctuation, collapse whitespace.
docs <- docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Drop English stopwords, then stem the remaining tokens.
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
docs<- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
# Term frequencies across all house_rules, then a cloud of the top 200 terms.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
df <- data.frame(word = names(words),freq=words)
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
# Drop free-text, coordinate and review-derived columns before modelling;
# high.rating remains as the target.
# NOTE(review): "num.of.reviews" does not exist (the real column,
# "number.of.reviews", is also listed), so that entry is a harmless no-op.
drop2 <- c("NAME", "lat", "long", "num.of.reviews", "last.review", "reviews.per.month",
"review.rate.number","calculated.host.listings.count", "house_rules", "number.of.reviews")
airbnb <- airbnb[,!(names(airbnb) %in% drop2)] # drop more that we don't need
ncol(airbnb) # from 27 to 18 columns
## [1] 18
colnames(airbnb)
## [1] "host_identity_verified" "neighbourhood.group"
## [3] "neighbourhood" "instant_bookable"
## [5] "cancellation_policy" "room.type"
## [7] "Construction.year" "price"
## [9] "service.fee" "minimum.nights"
## [11] "availability.365" "name_unknown"
## [13] "name_en" "name_cn"
## [15] "name_kr" "name_fr"
## [17] "years.since.construction" "high.rating"
unique(airbnb$high.rating)
## [1] 0 1
## Levels: 0 1
# Class balance: ~23% positives (rating == 5).
sum(airbnb$high.rating == 1)
## [1] 23369
sum(airbnb$high.rating == 0)
## [1] 78904
set.seed(5)
# Stratified 50% resample: sample half of each neighbourhood x host x
# bookable x policy x room-type cell so rare combinations stay represented.
final_airbnb <- airbnb %>%
nest_by(neighbourhood, host_identity_verified, instant_bookable, cancellation_policy, room.type, .key = "xy") %>%
mutate(sample = list(xy[sample(1:nrow(xy),
size = round(0.5*nrow(xy))),])) %>%
select(-xy) %>%
summarize(sample)
## `summarise()` has grouped output by 'neighbourhood', 'host_identity_verified',
## 'instant_bookable', 'cancellation_policy', 'room.type'. You can override using
## the `.groups` argument.
# Full-data overview after the final column drops.
summary(airbnb)
## host_identity_verified neighbourhood.group neighbourhood
## Length:102273 Length:102273 Length:102273
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## instant_bookable cancellation_policy room.type Construction.year
## Length:102273 Length:102273 Length:102273 Min. :2003
## Class :character Class :character Class :character 1st Qu.:2008
## Mode :character Mode :character Mode :character Median :2012
## Mean :2012
## 3rd Qu.:2017
## Max. :2022
## price service.fee minimum.nights availability.365
## Min. : 50.0 Min. : 10 Min. : 1.000 Min. : 0
## 1st Qu.: 341.0 1st Qu.: 68 1st Qu.: 2.000 1st Qu.: 3
## Median : 625.0 Median :125 Median : 3.000 Median : 96
## Mean : 625.4 Mean :125 Mean : 6.934 Mean :140
## 3rd Qu.: 912.0 3rd Qu.:182 3rd Qu.: 5.000 3rd Qu.:268
## Max. :1200.0 Max. :240 Max. :31.000 Max. :365
## name_unknown name_en name_cn name_kr
## Min. :0.000000 Min. :0.0000 Min. :0.000000 Min. :0.0e+00
## 1st Qu.:0.000000 1st Qu.:1.0000 1st Qu.:0.000000 1st Qu.:0.0e+00
## Median :0.000000 Median :1.0000 Median :0.000000 Median :0.0e+00
## Mean :0.002356 Mean :0.9264 Mean :0.002298 Mean :8.8e-05
## 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:0.0e+00
## Max. :1.000000 Max. :1.0000 Max. :1.000000 Max. :1.0e+00
## name_fr years.since.construction high.rating
## Min. :0.000000 Min. :-9.000 0:78904
## 1st Qu.:0.000000 1st Qu.: 2.000 1:23369
## Median :0.000000 Median : 7.000
## Mean :0.001095 Mean : 6.509
## 3rd Qu.:0.000000 3rd Qu.:11.000
## Max. :1.000000 Max. :19.000
# Resampled-data overview; distributions closely track the full data above.
summary(final_airbnb)
## neighbourhood host_identity_verified instant_bookable
## Length:50696 Length:50696 Length:50696
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## cancellation_policy room.type neighbourhood.group Construction.year
## Length:50696 Length:50696 Length:50696 Min. :2003
## Class :character Class :character Class :character 1st Qu.:2008
## Mode :character Mode :character Mode :character Median :2013
## Mean :2013
## 3rd Qu.:2018
## Max. :2022
## price service.fee minimum.nights availability.365
## Min. : 50.0 Min. : 10.0 Min. : 1.000 Min. : 0.0
## 1st Qu.: 344.0 1st Qu.: 69.0 1st Qu.: 2.000 1st Qu.: 3.0
## Median : 627.0 Median :125.0 Median : 3.000 Median : 96.0
## Mean : 627.9 Mean :125.6 Mean : 6.989 Mean :139.6
## 3rd Qu.: 915.0 3rd Qu.:183.0 3rd Qu.: 5.000 3rd Qu.:267.0
## Max. :1200.0 Max. :240.0 Max. :31.000 Max. :365.0
## name_unknown name_en name_cn name_kr
## Min. :0.000000 Min. :0.0000 Min. :0.000000 Min. :0.00e+00
## 1st Qu.:0.000000 1st Qu.:1.0000 1st Qu.:0.000000 1st Qu.:0.00e+00
## Median :0.000000 Median :1.0000 Median :0.000000 Median :0.00e+00
## Mean :0.002268 Mean :0.9272 Mean :0.002268 Mean :9.86e-05
## 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:0.00e+00
## Max. :1.000000 Max. :1.0000 Max. :1.000000 Max. :1.00e+00
## name_fr years.since.construction high.rating
## Min. :0.000000 Min. :-9.000 0:39192
## 1st Qu.:0.000000 1st Qu.: 2.000 1:11504
## Median :0.000000 Median : 6.000
## Mean :0.001085 Mean : 6.473
## 3rd Qu.:0.000000 3rd Qu.:11.000
## Max. :1.000000 Max. :19.000
# this is to make sure that most neighbourhood appear in both train and test set
set.seed(1)
# Keep only neighbourhoods with at least 2 rows, so the stratified 75/25
# partition below can place each neighbourhood in both sets.
mydf <- final_airbnb %>%
mutate(all = paste(neighbourhood)) %>%
group_by(all) %>%
summarise(total=n()) %>%
filter(total>=2)
final_airbnb2 <- final_airbnb[paste(final_airbnb$neighbourhood) %in% mydf$all,]
set.seed(1)
# Partition stratified on neighbourhood: 75% train / 25% test.
IND_TRAIN <- createDataPartition(paste(final_airbnb2$neighbourhood), p = 0.75)$Resample
#train set
train <- final_airbnb2[ IND_TRAIN,]
#test set
test <- final_airbnb2[-IND_TRAIN,]
#write.csv(train, "/Users/t.l./Desktop/DUKE MQM/Fall 2022/Data Science for Biz/Final Project/train.csv")
#write.csv(test, "/Users/t.l./Desktop/DUKE MQM/Fall 2022/Data Science for Biz/Final Project/test.csv")
#sample<-sample(c(TRUE,FALSE), nrow(final_airbnb), replace = TRUE, prob = c(0.7, 0.3))
#train<-final_airbnb[sample,]
#test<-final_airbnb[!sample,]
#trainIndex <- createDataPartition(final_airbnb$high.rating, p = .7,
# list = FALSE,
# times = 1)
#train <- final_airbnb[ trainIndex,]
#test <- final_airbnb[-trainIndex,]
# Positive-class share in each split (~22-23% in both).
mean(train$high.rating == 1)
## [1] 0.2264364
mean(test$high.rating == 1)
## [1] 0.2284897
# it's around 22/23 vs. 77/78 in both train and test :)
# Check that every neighbourhood appearing in the test set also appears in
# the train set (otherwise prediction would hit unseen factor levels).
# Vectorized with setdiff() instead of the original grow-with-cbind loop.
neighbor <- setdiff(unique(test$neighbourhood), unique(train$neighbourhood))
if (length(neighbor) == 0) neighbor <- NULL # preserve original NULL result when coverage is complete
neighbor
## NULL
# if returns nothing, then all levels of neighbourhood in test set is contained in train set.
# Drop neighbourhood before model fitting: its many sparse levels can make
# the fit rank-deficient (misleading coefficients) and can surface new
# factor levels during cross-validation.
idx <- which(names(train) == "neighbourhood")
train<- train[,-idx]
# BUG FIX: the explanation above originally appeared here as bare prose
# ("why do we exclude the neighborhood? ..."), which is a syntax error when
# the script is sourced; it is kept as a comment instead.
# Design matrix for glmnet (dummy-encoded, intercept column dropped) and a
# logical response vector.
Mx<- model.matrix(high.rating ~ ., data=train)[,-1]
My<- train$high.rating == 1
# Full lasso path plus cross-validated lasso (binomial family).
lasso <- glmnet(Mx,My,family="binomial")
lassoCV <- cv.glmnet(Mx,My,family="binomial")
par(mar=c(1.5,1.5,2,1.5))
par(mai=c(1.5,1.5,2,1.5))
plot(lassoCV, main="Fitting Graph for CV Lasso \n \n # of non-zero coefficients ", xlab = expression(paste("log(",lambda,")")))
#### the deviance of lasso does not necessarily change as much
#### probably explain why in our paper?
num.features <- ncol(Mx)
num.n <- nrow(Mx)
num.rating <- sum(My)
# Bernoulli variance at the sample positive rate, used in lambda.theory.
w <- (num.rating/num.n)*(1-(num.rating/num.n))
#### For the binomial case, a theoretically valid choice is
lambda.theory <- sqrt(w*log(num.features/0.05)/num.n)
lassoTheory <- glmnet(Mx,My, family="binomial",lambda = lambda.theory)
# support() comes from the sourced DataAnalyticsFunctions.R helper file.
length(support(lassoTheory$beta)) # lasso theory doesn't work because it takes 0 coefficient
## [1] 1
#### Post Lasso #####
# NOTE(review): indexing lasso$beta by which.min(lassoCV$cvm) assumes lasso
# and lassoCV share the same lambda sequence; they were fit by separate
# calls, so verify — using coef(lassoCV, s = "lambda.min") would be safer.
features.min <- support(lasso$beta[,which.min(lassoCV$cvm)])
length(features.min)
## [1] 1
features.1se <- support(lasso$beta[,which.min( (lassoCV$lambda-lassoCV$lambda.1se)^2)])
length(features.1se)
## [1] 0
features.theory <- support(lassoTheory$beta)
length(features.theory)
## [1] 1
# Data frames of the selected columns for the post-lasso refits below.
data.min <- data.frame(Mx[,features.min],My)
data.1se <- data.frame(Mx[,features.1se],My) # PL.1se does not work
data.theory <- data.frame(Mx[,features.theory],My)
set.seed(1)
nfold <- 10
n = nrow(train)
# Randomly assign each training row to one of the 10 folds.
foldid <- rep(1:nfold,each=ceiling(n/nfold))[sample(1:n)]
# One column per candidate model; each row will hold a fold's
# out-of-sample accuracy.
OOS <- data.frame(m.lr=rep(NA,nfold),
m.lr.lasso.min=rep(NA,nfold), m.lr.lasso.1se=rep(NA,nfold), m.lr.lasso.theory=rep(NA,nfold),
m.lr.pl.min=rep(NA,nfold), #m.lr.pl.1se=rep(NA,nfold),
m.lr.pl.theory=rep(NA,nfold),
m.tree=rep(NA,nfold), m.rf = rep(NA,nfold), m.average=rep(NA,nfold))
# Out-of-sample performance: classification accuracy of `prediction`
# (a probability) against binary `actual` (0/1) at the given threshold.
PerformanceMeasure <- function(actual, prediction, threshold=.5) {
  predicted_class <- as.numeric(prediction > threshold)
  mean(predicted_class == actual) # accuracy
  # Alternatives considered in the original:
  # R2(y=actual, pred=prediction, family="binomial")
  # 1 - mean(abs(prediction - actual))
}
# :( takes extreeeeeeemely long time to run
# kindly wait for 10 minutes please...
# 10-fold CV: fit every candidate model on the other 9 folds, score accuracy on fold k.
for(k in 1:nfold){
cv_train <- which(foldid!=k) # train on all but fold `k'
actual <- as.numeric(as.character(train$high.rating[-cv_train]))
### Logistic regression
m.lr <- glm(high.rating == 1 ~., data=train, subset=cv_train, family="binomial")
pred.lr <- predict(m.lr, newdata=train[-cv_train,], type="response")
# FIX: was `OOS$m.lr <- ...` (no `[k]`), which overwrote the ENTIRE column with the
# current fold's accuracy on every iteration instead of storing one value per fold.
OOS$m.lr[k] <- PerformanceMeasure(actual=actual, prediction=pred.lr)
### the Lasso estimates min
m.lr.l.min <- glmnet(Mx[cv_train,],My[cv_train], family="binomial",lambda = lassoCV$lambda.min)
pred.lr.l.min <- predict(m.lr.l.min, newx=Mx[-cv_train,], type="response")
OOS$m.lr.lasso.min[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.l.min)
### the Lasso estimates 1se
m.lr.l.1se <- glmnet(Mx[cv_train,],My[cv_train], family="binomial",lambda = lassoCV$lambda.1se)
pred.lr.l.1se <- predict(m.lr.l.1se, newx=Mx[-cv_train,], type="response")
OOS$m.lr.lasso.1se[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.l.1se)
### the Lasso estimates theory
m.lr.l.theory <- glmnet(Mx[cv_train,],My[cv_train], family="binomial",lambda = lambda.theory)
pred.lr.l.theory <- predict(m.lr.l.theory, newx=Mx[-cv_train,], type="response")
OOS$m.lr.lasso.theory[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.l.theory)
### This is the CV for the Post Lasso Estimates
rmin <- glm(My~., data=data.min, subset=cv_train, family="binomial")
#if ( length(features.1se) == 0){ r1se <- glm(high.rating~1, data=train, subset=cv_train, family="binomial")
#} else {r1se <- glm(My~., data=data.1se, subset=cv_train, family="binomial")
#}
# Guard: if the theory lasso selected no features, fall back to an intercept-only model.
if ( length(features.theory) == 0){
rtheory <- glm(high.rating~1, data=train, subset=cv_train, family="binomial")
} else {rtheory <- glm(My~., data=data.theory, subset=cv_train, family="binomial") }
pred.lr.pl.min <- predict(rmin, newdata=data.min[-cv_train,], type="response")
#pred.lr.pl.1se <- predict(r1se, newdata=data.1se[-cv_train,], type="response")
pred.lr.pl.theory <- predict(rtheory, newdata=data.theory[-cv_train,], type="response")
OOS$m.lr.pl.min[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.pl.min)
#OOS$m.lr.pl.1se[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.pl.1se)
OOS$m.lr.pl.theory[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.lr.pl.theory)
### Classification tree
m.tree <- tree(as.factor(high.rating) ~ ., data=train, subset=cv_train )
pred.tree <- predict(m.tree, newdata = train[-cv_train,], type="vector")
pred.tree <- pred.tree[,2] # keep the probability of the positive class
OOS$m.tree[k] <- PerformanceMeasure(actual=actual, prediction=pred.tree)
#### tree does not work as we wish. because it's not that
### Random Forest
m.rf <- randomForest(high.rating ~ ., data=train, subset=cv_train)
pred.rf <- predict(m.rf, newdata=train[-cv_train,],"prob")[,2]
OOS$m.rf[k] <- PerformanceMeasure(actual=actual, prediction=pred.rf)
# we would discuss that after cross validation
### optimized Random Forest
#m.rf.opt1 <- randomForest(high.rating ~ ., data=train, subset=cv_train, ntree = 300, mtry = 14)
#pred.rf.opt1 <- predict(m.rf.opt1, newdata=train[-cv_train,], "prob")[,2]
# use "prob" so that we got probability
#OOS$m.rf.opt1[k] <- PerformanceMeasure(actual=actual, pred=pred.rf.opt1)
# Ensemble: simple average of every model's predicted probability for this fold.
pred.m.average <- rowMeans(cbind(pred.lr,pred.lr.l.min, pred.lr.l.1se, pred.lr.l.theory,
pred.lr.pl.min, pred.tree, pred.rf ))
OOS$m.average[k] <- PerformanceMeasure(actual=My[-cv_train], prediction=pred.m.average)
print(paste("Iteration",k,"of",nfold,"(thank you for your patience)"))
}
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
## [1] "Iteration 1 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 2 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 3 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 4 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 5 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 6 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 7 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 8 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 9 of 10 (thank you for your patience)"
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## Warning in tree(as.factor(high.rating) ~ ., data = train, subset = cv_train):
## NAs introduced by coercion
## [1] "Iteration 10 of 10 (thank you for your patience)"
OOS
# just out of curiosity, have a look at the confusion matrix in the last cross validation process
pred <- predict(m.rf, newdata=train[-cv_train,]) # m.rf in this case would be the 10th random forest in cross validation
sum(pred == 1 & actual == 1) # TP (predicted 1, actually 1)
## [1] 28
sum(pred == 1 & actual == 0) # FP (predicted 1, actually 0) — original comment mislabeled this "FN"
## [1] 4
sum(pred == 0 & actual == 1) # FN (predicted 0, actually 1) — original comment mislabeled this "FP"
## [1] 824
sum(pred == 0 & actual == 0) # TN (predicted 0, actually 0) — original comment mislabeled this "FN"
## [1] 2953
# Compare mean OOS accuracy across the candidate models.
par(mar=c(7,5,.5,1)+0.3)
barplot(colMeans(OOS), las=2,xpd=FALSE , xlab="", ylim=c(0.6,0.8), ylab = "")
# exclude the space kept for post lasso because it does not work (no coefficient chosen)
# random forest is slightly higher
# take extremely long time to run since it's a function with high computational cost within a for loop
# we want to tune hyperparameters, but it would be even more time consuming if we consider all different combinations
# so we choose ntree to be 300-600 and search for the best combinations
# Hyper-parameter search: for each candidate ntree, let tuneRF search over mtry
# and record the best (mtry, OOB error) pair found.
opt_rf_comb <- data.frame("ntree" = c(300,400,500,600), "mtry" = c(NA, NA, NA, NA), "OOBError" = c(NA, NA, NA, NA))
for (i in seq_len(nrow(opt_rf_comb))) {
  # Current ntree candidate. FIX: renamed from `n`, which shadowed the sample
  # size `n <- nrow(train)` defined earlier in the script.
  ntree_try <- opt_rf_comb$ntree[i]
  # tuneRF optimizes mtry, stopping once the OOB error no longer improves by `improve`.
  # FIX: the original passed `step=0.9`, which only worked via partial argument
  # matching — the real argument is `stepFactor`; spell it out.
  bestmtry <- tuneRF(Mx, as.factor(My), mtryStart=5, stepFactor=0.9, ntreeTry=ntree_try, trace=TRUE, improve=1e-4)
  # normally improve = 1e-5 but use 1e-4 here to save some time
  bestmtry <- data.frame(bestmtry)
  # Keep the mtry with the smallest OOB error for this ntree. which.min() returns
  # the first minimum, matching the original `which(... == min(...))[1]` tie-break.
  best_row <- which.min(bestmtry$OOBError)
  opt_rf_comb$mtry[i] <- bestmtry$mtry[best_row]
  opt_rf_comb$OOBError[i] <- bestmtry$OOBError[best_row]
}
## mtry = 5 OOB error = 21.89%
## Searching left ...
## mtry = 6 OOB error = 21.61%
## 0.01270983 1e-04
## mtry = 7 OOB error = 21.29%
## 0.01469517 1e-04
## mtry = 8 OOB error = 21.03%
## 0.0123259 1e-04
## mtry = 9 OOB error = 20.85%
## 0.008611007 1e-04
## mtry = 10 OOB error = 20.8%
## 0.002391742 1e-04
## mtry = 12 OOB error = 20.6%
## 0.009716088 1e-04
## mtry = 14 OOB error = 20.5%
## 0.004714577 1e-04
## mtry = 16 OOB error = 20.5%
## -0.0001280246 1e-04
## Searching right ...
## mtry = 4 OOB error = 22.49%
## -0.09717066 1e-04
## mtry = 5 OOB error = 21.98%
## Searching left ...
## mtry = 6 OOB error = 21.55%
## 0.01922847 1e-04
## mtry = 7 OOB error = 21.2%
## 0.01643936 1e-04
## mtry = 8 OOB error = 21.15%
## 0.002476167 1e-04
## mtry = 9 OOB error = 20.92%
## 0.01067395 1e-04
## mtry = 10 OOB error = 20.76%
## 0.007652741 1e-04
## mtry = 12 OOB error = 20.57%
## 0.009102402 1e-04
## mtry = 14 OOB error = 20.44%
## 0.006379178 1e-04
## mtry = 16 OOB error = 20.4%
## 0.00192604 1e-04
## mtry = 18 OOB error = 20.5%
## -0.005017368 1e-04
## Searching right ...
## mtry = 4 OOB error = 22.52%
## -0.1039496 1e-04
## mtry = 5 OOB error = 21.95%
## Searching left ...
## mtry = 6 OOB error = 21.53%
## 0.01925146 1e-04
## mtry = 7 OOB error = 21.22%
## 0.01426481 1e-04
## mtry = 8 OOB error = 20.99%
## 0.01100804 1e-04
## mtry = 9 OOB error = 20.86%
## 0.006253127 1e-04
## mtry = 10 OOB error = 20.7%
## 0.007676819 1e-04
## mtry = 12 OOB error = 20.61%
## 0.003931516 1e-04
## mtry = 14 OOB error = 20.48%
## 0.006493506 1e-04
## mtry = 16 OOB error = 20.39%
## 0.00461361 1e-04
## mtry = 18 OOB error = 20.39%
## -0.0003862495 1e-04
## Searching right ...
## mtry = 4 OOB error = 22.54%
## -0.1058324 1e-04
## mtry = 5 OOB error = 21.92%
## Searching left ...
## mtry = 6 OOB error = 21.51%
## 0.01856287 1e-04
## mtry = 7 OOB error = 21.19%
## 0.0147651 1e-04
## mtry = 8 OOB error = 20.93%
## 0.01226158 1e-04
## mtry = 9 OOB error = 20.79%
## 0.006645768 1e-04
## mtry = 10 OOB error = 20.68%
## 0.005427922 1e-04
## mtry = 12 OOB error = 20.43%
## 0.01231121 1e-04
## mtry = 14 OOB error = 20.48%
## -0.002827037 1e-04
## Searching right ...
## mtry = 4 OOB error = 22.52%
## -0.1022873 1e-04
# Best (mtry, OOB error) found for each candidate ntree.
opt_rf_comb
# generally performs best when mtry is in the range of 16
We want to explain why the OOB error is a valid measure for tuning hyper-parameters and how it differs from the OOS measure (this needs further research).
Note that we could have investigated in more detail, but the rough trend has already been found; this is just an illustration of how we would search for the best hyper-parameters, and we did not necessarily find the very best. Specifically: 1) we only tried ntree from 300 to 600; 2) the step size is probably not fine enough — e.g. we know that when ntree = 400 the minimum OOB error is 20.48% with mtry at either 16 or 18, but we did not try mtry = 17; 3) we would typically continue until the improvement in OOB error is less than 1e-5, but we used 1e-4 as the criterion here; 4) we could even test for the best threshold (e.g. target = 1 if p > 0.5/0.6/0.7), but we used the default value here.
Now that we have the optimized model, can we improve accuracy by adjusting the classification threshold?
# wait patiently please. there are three random forest models. machine gets tired too ^v^
# 5-fold CV comparing the baseline forest against (ntree, threshold) candidates at mtry = 16.
set.seed(1)
nfold <- 5 # it's a smaller number, because it takes extremely long time to finish one round of loop
n = nrow(train)
foldid <- rep(1:nfold,each=ceiling(n/nfold))[sample(1:n)]
# FIX: pre-allocate every column assigned inside the loop. The original declared only
# 4 of the 13 columns; assignments like `OOS_opt$m.rf.400.45[k]` then created the
# missing column on the fly, relying on R silently recycling a short vector to
# nrow(OOS_opt) — fragile and hard to read.
OOS_opt <- data.frame(m.rf.ori = rep(NA,nfold),
m.rf.400.45=rep(NA,nfold), m.rf.400.50=rep(NA,nfold), m.rf.400.55=rep(NA,nfold), m.rf.400.60=rep(NA,nfold),
m.rf.500.45=rep(NA,nfold), m.rf.500.50=rep(NA,nfold), m.rf.500.55=rep(NA,nfold), m.rf.500.60=rep(NA,nfold),
m.rf.600.45=rep(NA,nfold), m.rf.600.50=rep(NA,nfold), m.rf.600.55=rep(NA,nfold), m.rf.600.60=rep(NA,nfold))
for(k in 1:nfold){
cv_train <- which(foldid!=k) # train on all but fold `k'
actual <- as.numeric(as.character(train$high.rating[-cv_train]))
# original random forest as the base model
m.rf <- randomForest(high.rating ~., data = train, subset = cv_train)
pred.rf <- predict(m.rf, newdata=train[-cv_train,],"prob")[,2]
OOS_opt$m.rf.ori[k] <- 1-mean( abs( (pred.rf > 0.5) - actual ) )
# different optimized random forest candidates
m.rf.400 <- randomForest(high.rating ~., data = train, subset = cv_train, mtry = 16, ntree = 400)
pred.rf.400 <- predict(m.rf.400, newdata=train[-cv_train,],"prob")[,2]
m.rf.500 <- randomForest(high.rating ~., data = train, subset = cv_train, mtry = 16, ntree = 500)
pred.rf.500 <- predict(m.rf.500, newdata=train[-cv_train,],"prob")[,2]
m.rf.600 <- randomForest(high.rating ~., data = train, subset = cv_train, mtry = 16, ntree = 600)
pred.rf.600 <- predict(m.rf.600, newdata=train[-cv_train,],"prob")[,2]
# Same predicted probabilities, different cut-offs: accuracy at each candidate threshold.
# If a non-default threshold wins, we would have to justify it in the write-up.
OOS_opt$m.rf.400.45[k] <- 1-mean( abs( (pred.rf.400 > 0.45) - actual ) )
OOS_opt$m.rf.400.50[k] <- 1-mean( abs( (pred.rf.400 > 0.5) - actual ) )
OOS_opt$m.rf.400.55[k] <- 1-mean( abs( (pred.rf.400 > 0.55) - actual ) )
OOS_opt$m.rf.400.60[k] <- 1-mean( abs( (pred.rf.400 > 0.6) - actual ) )
OOS_opt$m.rf.500.45[k] <- 1-mean( abs( (pred.rf.500 > 0.45) - actual ) )
OOS_opt$m.rf.500.50[k] <- 1-mean( abs( (pred.rf.500 > 0.5) - actual ) )
OOS_opt$m.rf.500.55[k] <- 1-mean( abs( (pred.rf.500 > 0.55) - actual ) )
OOS_opt$m.rf.500.60[k] <- 1-mean( abs( (pred.rf.500 > 0.6) - actual ) )
OOS_opt$m.rf.600.45[k] <- 1-mean( abs( (pred.rf.600 > 0.45) - actual ) )
OOS_opt$m.rf.600.50[k] <- 1-mean( abs( (pred.rf.600 > 0.5) - actual ) )
OOS_opt$m.rf.600.55[k] <- 1-mean( abs( (pred.rf.600 > 0.55) - actual ) )
OOS_opt$m.rf.600.60[k] <- 1-mean( abs( (pred.rf.600 > 0.6) - actual ) )
print(paste("Iteration",k,"of",nfold,"(thank you for your patience)"))
}
## [1] "Iteration 1 of 5 (thank you for your patience)"
## [1] "Iteration 2 of 5 (thank you for your patience)"
## [1] "Iteration 3 of 5 (thank you for your patience)"
## [1] "Iteration 4 of 5 (thank you for your patience)"
## [1] "Iteration 5 of 5 (thank you for your patience)"
# Per-fold OOS accuracy for the baseline forest and each (ntree, threshold) candidate.
OOS_opt
par(mar=c(7,5,.5,1)+0.3)
# Narrow y-limits to magnify the small differences between candidates.
barplot(colMeans(OOS_opt), las=2,xpd=FALSE , xlab="", ylim=c(0.78,0.8), ylab = "")
# exclude the space kept for post lasso
# seems that set at mtry=16, ntree = 600, threshold = 0.5 makes more sense
# Final model: retrain the chosen forest (ntree = 600, mtry = 16) on the full
# training set and evaluate once on the held-out test set.
set.seed(1)
m.rf.opt.final <- randomForest(high.rating ~ ., data=train, ntree = 600, mtry = 16)
pred.rf.opt.final <- predict(m.rf.opt.final, newdata=test, "prob")[,2]
# use "prob" so that we got probability
# since the original performance measure function set threshold to be 0.5, we would directly use the function
# (note: `pred=` partially matches PerformanceMeasure's `prediction` argument)
PerformanceMeasure(actual=as.numeric(as.character(test$high.rating)), pred=pred.rf.opt.final)
## [1] 0.7977278
test.pred <- predict(m.rf.opt.final, newdata=test)
test.pred <- as.numeric(as.character(test.pred))
# this is the 1 and 0 we finally want
# Test-set confusion matrix:
sum(test.pred == 1 & test$high.rating == 1) # TP (predicted 1, actually 1)
## [1] 410
sum(test.pred == 1 & test$high.rating == 0) # FP (predicted 1, actually 0) — original comment mislabeled this "FN"
## [1] 79
sum(test.pred == 0 & test$high.rating == 1) # FN (predicted 0, actually 1) — original comment mislabeled this "FP"
## [1] 2466
sum(test.pred == 0 & test$high.rating == 0) # TN (predicted 0, actually 0)
## [1] 9632
roc_score=roc(test$high.rating, pred.rf.opt.final) #AUC-ROC score, you can print it if needed
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score ,main ="ROC curve")
# please search for the best way to visualize the ROC score from the class script
# Variable-importance plot and raw importance scores for the final forest.
varImpPlot(m.rf.opt.final)
feature_importance <- m.rf.opt.final$importance